{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 6.4 \u589e\u91cf\u5f0f\u89e3\u6790\u5927\u578bXML\u6587\u4ef6\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### \u95ee\u9898\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u4f60\u60f3\u4f7f\u7528\u5c3d\u53ef\u80fd\u5c11\u7684\u5185\u5b58\u4ece\u4e00\u4e2a\u8d85\u5927\u7684XML\u6587\u6863\u4e2d\u63d0\u53d6\u6570\u636e\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### \u89e3\u51b3\u65b9\u6848\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u4efb\u4f55\u65f6\u5019\u53ea\u8981\u4f60\u9047\u5230\u589e\u91cf\u5f0f\u7684\u6570\u636e\u5904\u7406\u65f6\uff0c\u7b2c\u4e00\u65f6\u95f4\u5c31\u5e94\u8be5\u60f3\u5230\u8fed\u4ee3\u5668\u548c\u751f\u6210\u5668\u3002\n\u4e0b\u9762\u662f\u4e00\u4e2a\u5f88\u7b80\u5355\u7684\u51fd\u6570\uff0c\u53ea\u4f7f\u7528\u5f88\u5c11\u7684\u5185\u5b58\u5c31\u80fd\u589e\u91cf\u5f0f\u7684\u5904\u7406\u4e00\u4e2a\u5927\u578bXML\u6587\u4ef6\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from xml.etree.ElementTree import iterparse\n\ndef parse_and_remove(filename, path):\n    path_parts = path.split('/')\n    doc = iterparse(filename, ('start', 'end'))\n    # Skip the root element\n    next(doc)\n\n    tag_stack = []\n    elem_stack = []\n    for event, elem in doc:\n        if event == 'start':\n            tag_stack.append(elem.tag)\n            elem_stack.append(elem)\n        elif event == 'end':\n            if tag_stack == path_parts:\n                yield elem\n                elem_stack[-2].remove(elem)\n            try:\n                tag_stack.pop()\n                elem_stack.pop()\n            except IndexError:\n                pass"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u4e3a\u4e86\u6d4b\u8bd5\u8fd9\u4e2a\u51fd\u6570\uff0c\u4f60\u9700\u8981\u5148\u6709\u4e00\u4e2a\u5927\u578b\u7684XML\u6587\u4ef6\u3002\n\u901a\u5e38\u4f60\u53ef\u4ee5\u5728\u653f\u5e9c\u7f51\u7ad9\u6216\u516c\u5171\u6570\u636e\u7f51\u7ad9\u4e0a\u627e\u5230\u8fd9\u6837\u7684\u6587\u4ef6\u3002\n\u4f8b\u5982\uff0c\u4f60\u53ef\u4ee5\u4e0b\u8f7dXML\u683c\u5f0f\u7684\u829d\u52a0\u54e5\u57ce\u5e02\u9053\u8def\u5751\u6d3c\u6570\u636e\u5e93\u3002\n\u5728\u5199\u8fd9\u672c\u4e66\u7684\u65f6\u5019\uff0c\u4e0b\u8f7d\u6587\u4ef6\u5df2\u7ecf\u5305\u542b\u8d85\u8fc7100,000\u884c\u6570\u636e\uff0c\u7f16\u7801\u683c\u5f0f\u7c7b\u4f3c\u4e8e\u4e0b\u9762\u8fd9\u6837\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "<response>\n    <row>\n        <row ...>\n            <creation_date>2012-11-18T00:00:00</creation_date>\n            <status>Completed</status>\n            <completion_date>2012-11-18T00:00:00</completion_date>\n            <service_request_number>12-01906549</service_request_number>\n            <type_of_service_request>Pot Hole in Street</type_of_service_request>\n            <current_activity>Final Outcome</current_activity>\n            <most_recent_action>CDOT Street Cut ... Outcome</most_recent_action>\n            <street_address>4714 S TALMAN AVE</street_address>\n            <zip>60632</zip>\n            <x_coordinate>1159494.68618856</x_coordinate>\n            <y_coordinate>1873313.83503384</y_coordinate>\n            <ward>14</ward>\n            <police_district>9</police_district>\n            <community_area>58</community_area>\n            <latitude>41.808090232127896</latitude>\n            <longitude>-87.69053684711305</longitude>\n            <location latitude=\"41.808090232127896\"\n            longitude=\"-87.69053684711305\" />\n        </row>\n        <row ...>\n            <creation_date>2012-11-18T00:00:00</creation_date>\n            <status>Completed</status>\n            <completion_date>2012-11-18T00:00:00</completion_date>\n            <service_request_number>12-01906695</service_request_number>\n            <type_of_service_request>Pot Hole in Street</type_of_service_request>\n            <current_activity>Final Outcome</current_activity>\n            <most_recent_action>CDOT Street Cut ... Outcome</most_recent_action>\n            <street_address>3510 W NORTH AVE</street_address>\n            <zip>60647</zip>\n            <x_coordinate>1152732.14127696</x_coordinate>\n            <y_coordinate>1910409.38979075</y_coordinate>\n            <ward>26</ward>\n            <police_district>14</police_district>\n            <community_area>23</community_area>\n            <latitude>41.91002084292946</latitude>\n            <longitude>-87.71435952353961</longitude>\n            <location latitude=\"41.91002084292946\"\n            longitude=\"-87.71435952353961\" />\n        </row>\n    </row>\n</response>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u5047\u8bbe\u4f60\u60f3\u5199\u4e00\u4e2a\u811a\u672c\u6765\u6309\u7167\u5751\u6d3c\u62a5\u544a\u6570\u91cf\u6392\u5217\u90ae\u7f16\u53f7\u7801\u3002\u4f60\u53ef\u4ee5\u50cf\u8fd9\u6837\u505a\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from xml.etree.ElementTree import parse\nfrom collections import Counter\n\npotholes_by_zip = Counter()\n\ndoc = parse('potholes.xml')\nfor pothole in doc.iterfind('row/row'):\n    potholes_by_zip[pothole.findtext('zip')] += 1\nfor zipcode, num in potholes_by_zip.most_common():\n    print(zipcode, num)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u8fd9\u4e2a\u811a\u672c\u552f\u4e00\u7684\u95ee\u9898\u662f\u5b83\u4f1a\u5148\u5c06\u6574\u4e2aXML\u6587\u4ef6\u52a0\u8f7d\u5230\u5185\u5b58\u4e2d\u7136\u540e\u89e3\u6790\u3002\n\u5728\u6211\u7684\u673a\u5668\u4e0a\uff0c\u4e3a\u4e86\u8fd0\u884c\u8fd9\u4e2a\u7a0b\u5e8f\u9700\u8981\u7528\u5230450MB\u5de6\u53f3\u7684\u5185\u5b58\u7a7a\u95f4\u3002\n\u5982\u679c\u4f7f\u7528\u5982\u4e0b\u4ee3\u7801\uff0c\u7a0b\u5e8f\u53ea\u9700\u8981\u4fee\u6539\u4e00\u70b9\u70b9\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from collections import Counter\n\npotholes_by_zip = Counter()\n\ndata = parse_and_remove('potholes.xml', 'row/row')\nfor pothole in data:\n    potholes_by_zip[pothole.findtext('zip')] += 1\nfor zipcode, num in potholes_by_zip.most_common():\n    print(zipcode, num)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u7ed3\u679c\u662f\uff1a\u8fd9\u4e2a\u7248\u672c\u7684\u4ee3\u7801\u8fd0\u884c\u65f6\u53ea\u9700\u89817MB\u7684\u5185\u5b58\u2013\u5927\u5927\u8282\u7ea6\u4e86\u5185\u5b58\u8d44\u6e90\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### \u8ba8\u8bba\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u8fd9\u4e00\u8282\u7684\u6280\u672f\u4f1a\u4f9d\u8d56 ElementTree \u6a21\u5757\u4e2d\u7684\u4e24\u4e2a\u6838\u5fc3\u529f\u80fd\u3002\n\u7b2c\u4e00\uff0citerparse() \u65b9\u6cd5\u5141\u8bb8\u5bf9XML\u6587\u6863\u8fdb\u884c\u589e\u91cf\u64cd\u4f5c\u3002\n\u4f7f\u7528\u65f6\uff0c\u4f60\u9700\u8981\u63d0\u4f9b\u6587\u4ef6\u540d\u548c\u4e00\u4e2a\u5305\u542b\u4e0b\u9762\u4e00\u79cd\u6216\u591a\u79cd\u7c7b\u578b\u7684\u4e8b\u4ef6\u5217\u8868\uff1a\nstart , end, start-ns \u548c end-ns \u3002\n\u7531 iterparse() \u521b\u5efa\u7684\u8fed\u4ee3\u5668\u4f1a\u4ea7\u751f\u5f62\u5982 (event, elem) \u7684\u5143\u7ec4\uff0c\n\u5176\u4e2d event \u662f\u4e0a\u8ff0\u4e8b\u4ef6\u5217\u8868\u4e2d\u7684\u67d0\u4e00\u4e2a\uff0c\u800c elem \u662f\u76f8\u5e94\u7684XML\u5143\u7d20\u3002\u4f8b\u5982\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "data = iterparse('potholes.xml',('start','end'))\nnext(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "next(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "next(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "next(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "next(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "next(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "next(data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "start \u4e8b\u4ef6\u5728\u67d0\u4e2a\u5143\u7d20\u7b2c\u4e00\u6b21\u88ab\u521b\u5efa\u5e76\u4e14\u8fd8\u6ca1\u6709\u88ab\u63d2\u5165\u5176\u4ed6\u6570\u636e(\u5982\u5b50\u5143\u7d20)\u65f6\u88ab\u521b\u5efa\u3002\n\u800c end \u4e8b\u4ef6\u5728\u67d0\u4e2a\u5143\u7d20\u5df2\u7ecf\u5b8c\u6210\u65f6\u88ab\u521b\u5efa\u3002\n\u5c3d\u7ba1\u6ca1\u6709\u5728\u4f8b\u5b50\u4e2d\u6f14\u793a\uff0c start-ns \u548c end-ns \u4e8b\u4ef6\u88ab\u7528\u6765\u5904\u7406XML\u6587\u6863\u547d\u540d\u7a7a\u95f4\u7684\u58f0\u660e\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u8fd9\u672c\u8282\u4f8b\u5b50\u4e2d\uff0c start \u548c end \u4e8b\u4ef6\u88ab\u7528\u6765\u7ba1\u7406\u5143\u7d20\u548c\u6807\u7b7e\u6808\u3002\n\u6808\u4ee3\u8868\u4e86\u6587\u6863\u88ab\u89e3\u6790\u65f6\u7684\u5c42\u6b21\u7ed3\u6784\uff0c\n\u8fd8\u88ab\u7528\u6765\u5224\u65ad\u67d0\u4e2a\u5143\u7d20\u662f\u5426\u5339\u914d\u4f20\u7ed9\u51fd\u6570 parse_and_remove() \u7684\u8def\u5f84\u3002\n\u5982\u679c\u5339\u914d\uff0c\u5c31\u5229\u7528 yield \u8bed\u53e5\u5411\u8c03\u7528\u8005\u8fd4\u56de\u8fd9\u4e2a\u5143\u7d20\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u5728 yield \u4e4b\u540e\u7684\u4e0b\u9762\u8fd9\u4e2a\u8bed\u53e5\u624d\u662f\u4f7f\u5f97\u7a0b\u5e8f\u5360\u7528\u6781\u5c11\u5185\u5b58\u7684ElementTree\u7684\u6838\u5fc3\u7279\u6027\uff1a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "elem_stack[-2].remove(elem)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u8fd9\u4e2a\u8bed\u53e5\u4f7f\u5f97\u4e4b\u524d\u7531 yield \u4ea7\u751f\u7684\u5143\u7d20\u4ece\u5b83\u7684\u7236\u8282\u70b9\u4e2d\u5220\u9664\u6389\u3002\n\u5047\u8bbe\u5df2\u7ecf\u6ca1\u6709\u5176\u5b83\u7684\u5730\u65b9\u5f15\u7528\u8fd9\u4e2a\u5143\u7d20\u4e86\uff0c\u90a3\u4e48\u8fd9\u4e2a\u5143\u7d20\u5c31\u88ab\u9500\u6bc1\u5e76\u56de\u6536\u5185\u5b58\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u5bf9\u8282\u70b9\u7684\u8fed\u4ee3\u5f0f\u89e3\u6790\u548c\u5220\u9664\u7684\u6700\u7ec8\u6548\u679c\u5c31\u662f\u4e00\u4e2a\u5728\u6587\u6863\u4e0a\u9ad8\u6548\u7684\u589e\u91cf\u5f0f\u6e05\u626b\u8fc7\u7a0b\u3002\n\u6587\u6863\u6811\u7ed3\u6784\u4ece\u59cb\u81ea\u7ec8\u6ca1\u88ab\u5b8c\u6574\u7684\u521b\u5efa\u8fc7\u3002\u5c3d\u7ba1\u5982\u6b64\uff0c\u8fd8\u662f\u80fd\u901a\u8fc7\u4e0a\u8ff0\u7b80\u5355\u7684\u65b9\u5f0f\u6765\u5904\u7406\u8fd9\u4e2aXML\u6570\u636e\u3002"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\u8fd9\u79cd\u65b9\u6848\u7684\u4e3b\u8981\u7f3a\u9677\u5c31\u662f\u5b83\u7684\u8fd0\u884c\u6027\u80fd\u4e86\u3002\n\u6211\u81ea\u5df1\u6d4b\u8bd5\u7684\u7ed3\u679c\u662f\uff0c\u8bfb\u53d6\u6574\u4e2a\u6587\u6863\u5230\u5185\u5b58\u4e2d\u7684\u7248\u672c\u7684\u8fd0\u884c\u901f\u5ea6\u5dee\u4e0d\u591a\u662f\u589e\u91cf\u5f0f\u5904\u7406\u7248\u672c\u7684\u4e24\u500d\u5feb\u3002\n\u4f46\u662f\u5b83\u5374\u4f7f\u7528\u4e86\u8d85\u8fc7\u540e\u800560\u500d\u7684\u5185\u5b58\u3002\n\u56e0\u6b64\uff0c\u5982\u679c\u4f60\u66f4\u5173\u5fc3\u5185\u5b58\u4f7f\u7528\u91cf\u7684\u8bdd\uff0c\u90a3\u4e48\u589e\u91cf\u5f0f\u7684\u7248\u672c\u5b8c\u80dc\u3002"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.1"
    },
    "toc": {
      "base_numbering": 1,
      "nav_menu": {},
      "number_sections": true,
      "sideBar": true,
      "skip_h1_title": true,
      "title_cell": "Table of Contents",
      "title_sidebar": "Contents",
      "toc_cell": false,
      "toc_position": {},
      "toc_section_display": true,
      "toc_window_display": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}